Return to Data Page

A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance on T, as measured by P, improves with E. — Tom Mitchell, *Machine Learning* (1997)

from gapminder import gapminder

import pandas as pd
import numpy as np

import statsmodels.formula.api as smf
import math

import matplotlib.pylab as plt
import seaborn as sns 

import warnings
warnings.filterwarnings('ignore')

\(\color{darkblue}{\textbf{Supervised}}\)


\(\color{dodgerblue}{\textbf{Linear Regression}}\)

# GDP per capita is heavily right-skewed, so model its natural log instead.
gapminder["logGDP"] = gapminder["gdpPercap"].pipe(np.log)
# Peek at the transformed predictor alongside the response variable.
gapminder.loc[:, ["logGDP", "lifeExp"]].head()
     logGDP  lifeExp
0  6.658583   28.801
1  6.710344   30.332
2  6.748878   31.997
3  6.728864   34.020
4  6.606625   36.088
# Scatter of life expectancy against log GDP with the fitted OLS line overlaid.
reg_plot = sns.regplot(
    x=gapminder["logGDP"],
    y=gapminder["lifeExp"],
    scatter_kws={"color": "black"},
    line_kws={"color": "red"},
)
reg_plot

# Simple linear regression: life expectancy explained by log GDP per capita.
model = smf.ols(formula='lifeExp~logGDP', data=gapminder)
lm = model.fit()
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     3192.
Date:                Sun, 22 May 2022   Prob (F-statistic):               0.00
Time:                        16:49:58   Log-Likelihood:                -5877.2
No. Observations:                1704   AIC:                         1.176e+04
Df Residuals:                    1702   BIC:                         1.177e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -9.1009      1.228     -7.413      0.000     -11.509      -6.693
logGDP         8.4051      0.149     56.500      0.000       8.113       8.697
==============================================================================
Omnibus:                      148.382   Durbin-Watson:                   0.398
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              205.732
Skew:                          -0.698   Prob(JB):                     2.12e-45
Kurtosis:                       3.973   Cond. No.                         55.7
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# Store the model's fitted values, then compare them to the observed values.
gapminder['predictedLifeExp'] = lm.predict()

fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(gapminder["lifeExp"], gapminder["predictedLifeExp"], color="black")
# 45-degree reference line: points on it would be predicted perfectly.
ax.plot([30, 90], [30, 90], color="red")
ax.set_xlabel("Expected")
ax.set_ylabel("Predicted")


# Multiple regression: add year as a second predictor alongside log GDP.
model2 = smf.ols(formula='lifeExp~logGDP+year', data=gapminder)
lm2 = model2.fit()
print(lm2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.717
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     2153.
Date:                Sun, 22 May 2022   Prob (F-statistic):               0.00
Time:                        16:50:00   Log-Likelihood:                -5702.1
No. Observations:                1704   AIC:                         1.141e+04
Df Residuals:                    1701   BIC:                         1.143e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -391.0514     19.418    -20.138      0.000    -429.138    -352.965
logGDP         7.7703      0.138     56.273      0.000       7.499       8.041
year           0.1956      0.010     19.702      0.000       0.176       0.215
==============================================================================
Omnibus:                      149.973   Durbin-Watson:                   0.326
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              210.183
Skew:                          -0.699   Prob(JB):                     2.29e-46
Kurtosis:                       4.003   Cond. No.                     2.31e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.31e+05. This might indicate that there are
strong multicollinearity or other numerical problems.


# Fitted values from the two-predictor model, compared against observations.
gapminder['predictedLifeExp2'] = lm2.predict()

fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(gapminder["lifeExp"], gapminder["predictedLifeExp2"], color="black")
# Perfect predictions would fall on this 45-degree line.
ax.plot([30, 90], [30, 90], color="red")
ax.set_xlabel("Expected")
ax.set_ylabel("Predicted")


\(\color{dodgerblue}{\textbf{Logistic Regression/Classification}}\)

\(\color{darkblue}{\textbf{Unsupervised}}\)


\(\color{dodgerblue}{\textbf{K Means Clustering}}\)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial.distance import cdist, pdist
import matplotlib.cm as cm

# Population is skewed like GDP, so log-transform it as well.
gapminder["logPop"] = gapminder["pop"].pipe(np.log)
# Restrict the clustering example to the most recent year in the data.
is_2007 = gapminder["year"] == 2007
gap_07 = gapminder[is_2007]

fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(gap_07["lifeExp"], gap_07["logPop"], color="black")
ax.set_xlabel("Life Expectancy")
ax.set_ylabel("Population (log)")

Choosing the Optimal Number of Clusters

#Silhouette Score
def silhouette_score_plot(k, data):
    """Return the mean silhouette score of a KMeans fit with k clusters.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : array-like / DataFrame
        Observations to cluster (rows = samples, columns = features).

    Returns
    -------
    float
        Average silhouette coefficient over all samples (higher is better).
    """
    # Pin n_init=10 (the historical sklearn default) explicitly: the default
    # became "auto" in scikit-learn 1.4, so relying on it silently changes
    # results across versions and emits a FutureWarning on 1.2/1.3.
    km = KMeans(n_clusters = k, random_state = 42, n_init = 10)
    cluster_labels = km.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    return silhouette_avg

# Evaluate the average silhouette score for k = 2..10.
# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# collect the rows in a plain list and build the frame once at the end.
_score_rows = []
for n in range(2, 11):
    score = silhouette_score_plot(n, gap_07[["lifeExp", "logPop", "gdpPercap"]])
    _score_rows.append({"k": n, "score": score})

scores = pd.DataFrame(_score_rows, columns=["k", "score"])


# Plot silhouette score against k; the peak marks the preferred k.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.plot(scores["k"], scores["score"])
ax.set_xlabel("K")
ax.set_ylabel("Silhouette Coefficient")
max_score = scores["score"].max()
best_k = int(scores[scores["score"] == max_score]["k"].squeeze())
ax.set_title("K with max coefficient = " + str(best_k))


#Elbow Method
# Fit KMeans for k = 1..10 and record the inertia (SSE) of each fit.
# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# accumulate rows in a list and construct the DataFrame once.
_sse_rows = []
for n in range(1, 11):
    km = KMeans(n_clusters = n, random_state = 42).fit(gap_07[["lifeExp", "logPop", "gdpPercap"]])
    # Inertia = sum of distances of samples to their closest cluster center
    _sse_rows.append({"k": n, "sse": km.inertia_})

sse = pd.DataFrame(_sse_rows, columns=["k", "sse"])

# Elbow plot: pick the k where the SSE curve stops dropping sharply.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.plot(sse["k"], sse["sse"])
ax.set(xlabel="K", ylabel="SSE", title="Elbow for KMeans Clustering")


Clustering

# Final model: K-Means with k = 2 (chosen from the silhouette/elbow analysis).
km = KMeans(random_state = 42, n_clusters = 2)
result = km.fit(gap_07[["lifeExp", "logPop", "gdpPercap"]])
# Work on an explicit copy: gap_07 is a boolean-mask slice of gapminder, and
# assigning a new column to it raises SettingWithCopyWarning (currently hidden
# only by the global warnings filter at the top of the file).
gap_07 = gap_07.copy()
gap_07["cluster"] = result.labels_

# Clusters shown in the GDP / life-expectancy plane, colored by label.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(gap_07["gdpPercap"], gap_07["lifeExp"], c=gap_07["cluster"])
ax.set(xlabel="GDP Per Capita", ylabel="Life Expectancy")

# Clusters shown in the population / life-expectancy plane, colored by label.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(gap_07["logPop"], gap_07["lifeExp"], c=gap_07["cluster"])
ax.set(xlabel="Population (log)", ylabel="Life Expectancy")

# Per-cluster feature means (the printed table below suggests one cluster of
# higher-GDP, longer-lived countries and one of lower-GDP, shorter-lived ones).
gap_07.groupby("cluster")[["lifeExp", "logPop", "gdpPercap"]].mean()
           lifeExp     logPop     gdpPercap
cluster                                    
0        79.171486  16.207095  31904.371711
1        63.028523  16.298523   5064.646622

\(\color{darkblue}{\textbf{Reinforcement}}\)


Return to Data Page